# Tally the rows of `lines` for every (scene, character) pair; the tally is
# stored in column `n`.
# NOTE(review): the recorded output below reports the result as grouped by
# `scene`. That grouping is not set by this statement, so it presumably comes
# from `lines` itself or from the installed dplyr version -- verify.
by_speaker_scene <- count(lines, scene, character)
by_speaker_scene
## # A tibble: 162 x 3
## # Groups: scene [76]
## scene character n
## <int> <chr> <int>
## 1 2 Billy (Bill Nighy) 5
## 2 2 Joe (Gregor Fisher) 3
## 3 3 Jamie (Colin Firth) 5
## 4 4 Daniel (Liam Neeson) 3
## 5 4 Karen (Emma Thompson) 6
## 6 5 Colin (Kris Marshall) 4
## 7 6 Jack (Martin Freeman) 2
## 8 6 Judy (Joanna Page) 1
## 9 7 Mark (Andrew Lincoln) 4
## 10 7 Peter (Chiwetel Ejiofor) 4
## # ... with 152 more rows
library(reshape2)

# Cast the long (scene, character, n) table into a matrix with one row per
# character and one column per scene. `fun.aggregate = length` counts the
# input rows that land in each cell; the recorded output below shows the
# resulting 0/1 entries.
# `value.var` is named explicitly so acast() does not have to guess the value
# column (the guess emits a message and depends on column order).
speaker_scene_matrix <- by_speaker_scene %>%
  acast(character ~ scene, value.var = "n", fun.aggregate = length)

# Peek at the top-left corner of the matrix.
speaker_scene_matrix[1:5, 1:5]
## 2 3 4 5 6
## Aurelia (Lúcia Moniz) 0 0 0 0 0
## Billy (Bill Nighy) 1 0 0 0 0
## Colin (Kris Marshall) 0 0 0 1 0
## Daniel (Liam Neeson) 0 0 1 0 0
## Harry (Alan Rickman) 0 0 0 0 0

# Rows are characters, columns are scenes.
dim(speaker_scene_matrix)
## [1] 20 76
# Divide each character's row by its row sum so every row sums to 1; distances
# then compare how a character's scenes are distributed rather than how many
# scenes they have.
# NOTE(review): `norm` masks base::norm() for the rest of the script; the name
# is kept because later code may refer to it.
norm <- speaker_scene_matrix / rowSums(speaker_scene_matrix)

# Hierarchical clustering of characters on Manhattan distance between rows.
h <- hclust(dist(norm, method = "manhattan"))
ggdendro::ggdendrogram(h)

# Character labels in dendrogram (leaf) order; reused below as factor levels
# so plots list characters in clustered order.
ordering <- h$labels[h$order]
ordering
## [1] "Natalie (Martine McCutcheon)" "PM (Hugh Grant)"
## [3] "Aurelia (Lúcia Moniz)" "Jamie (Colin Firth)"
## [5] "Daniel (Liam Neeson)" "Sam (Thomas Sangster)"
## [7] "Jack (Martin Freeman)" "Judy (Joanna Page)"
## [9] "Colin (Kris Marshall)" "Tony (Abdul Salis)"
## [11] "Billy (Bill Nighy)" "Joe (Gregor Fisher)"
## [13] "Mark (Andrew Lincoln)" "Juliet (Keira Knightley)"
## [15] "Peter (Chiwetel Ejiofor)" "Karl (Rodrigo Santoro)"
## [17] "Sarah (Laura Linney)" "Mia (Heike Makatsch)"
## [19] "Harry (Alan Rickman)" "Karen (Emma Thompson)"
# Keep only scenes in which more than one character speaks, then recode
# `scene` as consecutive integers and order `character` by the clustering.
# The grouping by scene is stated explicitly: `n()` counts rows per group, so
# without it the filter would compare the size of the whole table to 1.
scenes <- by_speaker_scene %>%
  group_by(scene) %>%
  filter(n() > 1) %>% # scenes with > 1 character
  ungroup() %>%
  mutate(
    scene = as.numeric(factor(scene)),
    character = factor(character, levels = ordering)
  )

# One point per (scene, character); the path joins the characters that share
# a scene.
ggplot(scenes, aes(scene, character)) +
  geom_point() +
  geom_path(aes(group = scene))

# Keep the scenes (columns) with fewer than 10 speaking characters.
# NOTE(review): the variable name suggests this is meant to drop the large
# airport ensemble scenes -- confirm the threshold against the data.
# `drop = FALSE` keeps a matrix even if only one column survives.
non_airport_scenes <-
  speaker_scene_matrix[, colSums(speaker_scene_matrix) < 10, drop = FALSE]
non_airport_scenes[1:5, 1:5]
## 2 3 4 5 6
## Aurelia (Lúcia Moniz) 0 0 0 0 0
## Billy (Bill Nighy) 1 0 0 0 0
## Colin (Kris Marshall) 0 0 0 1 0
## Daniel (Liam Neeson) 0 0 1 0 0
## Harry (Alan Rickman) 0 0 0 0 0
# Character-by-character co-occurrence matrix: the product of the
# character-by-scene matrix with its own transpose. With the 0/1 entries shown
# in the recorded output, cell (i, j) is the number of retained scenes in which
# characters i and j both speak, and the diagonal is each character's own
# scene count.
cooccur <- tcrossprod(non_airport_scenes)
cooccur[1:5, 1:5]
## Aurelia (Lúcia Moniz) Billy (Bill Nighy)
## Aurelia (Lúcia Moniz) 5 0
## Billy (Bill Nighy) 0 6
## Colin (Kris Marshall) 0 0
## Daniel (Liam Neeson) 0 0
## Harry (Alan Rickman) 0 0
## Colin (Kris Marshall) Daniel (Liam Neeson)
## Aurelia (Lúcia Moniz) 0 0
## Billy (Bill Nighy) 0 0
## Colin (Kris Marshall) 6 0
## Daniel (Liam Neeson) 0 11
## Harry (Alan Rickman) 0 0
## Harry (Alan Rickman)
## Aurelia (Lúcia Moniz) 0
## Billy (Bill Nighy) 0
## Colin (Kris Marshall) 0
## Daniel (Liam Neeson) 0
## Harry (Alan Rickman) 10
# Base-graphics heatmap of the co-occurrence matrix.
heatmap(cooccur)

# The same matrix as a ggplot tile chart. The matrix is reshaped to long form
# (id1, id2, n) and both id columns become factors ordered by the clustering,
# so the two axes list characters in the same order.
cooccur %>%
  as_tibble() %>%
  # as_tibble() does not carry the matrix row names over, so add them back.
  mutate(id1 = rownames(cooccur)) %>%
  gather(id2, n, -id1) %>%
  mutate(
    id1 = factor(id1, levels = ordering),
    id2 = factor(id2, levels = ordering)
  ) %>%
  ggplot(aes(id1, id2, fill = n)) +
  geom_tile() +
  scale_fill_continuous(low = "white", high = "red") +
  coord_fixed() +
  labs(
    x = NULL,
    y = NULL,
    fill = NULL
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Undirected, weighted graph built from the co-occurrence matrix: the cell
# values become the `weight` edge attribute, and `diag = FALSE` ignores the
# diagonal so no character is linked to itself.
g <- graph_from_adjacency_matrix(
  cooccur,
  weighted = TRUE,
  mode = "undirected",
  diag = FALSE
)
g
## IGRAPH d83365e UNW- 20 37 --
## + attr: name (v/c), weight (e/n)
## + edges from d83365e (vertex names):
## [1] Aurelia (Lúcia Moniz)--Jamie (Colin Firth)
## [2] Billy (Bill Nighy) --Joe (Gregor Fisher)
## [3] Colin (Kris Marshall)--Mark (Andrew Lincoln)
## [4] Colin (Kris Marshall)--Tony (Abdul Salis)
## [5] Daniel (Liam Neeson) --Karen (Emma Thompson)
## [6] Daniel (Liam Neeson) --Sam (Thomas Sangster)
## [7] Harry (Alan Rickman) --Jamie (Colin Firth)
## [8] Harry (Alan Rickman) --Karen (Emma Thompson)
## + ... omitted several edges
# Three drawings of the co-occurrence graph `g`. In each, edge width encodes
# the `weight` attribute, node labels are repelled to limit overlap, and the
# legend is hidden.
# NOTE(review): `base_size` is not defined in this section -- it is assumed to
# be set earlier in the file.

# 1. ggraph's default layout with straight edges.
ggraph(g) +
  geom_edge_link(aes(edge_width = weight)) +
  geom_node_point() +
  geom_node_text(aes(label = name), repel = TRUE, size = 3) +
  scale_edge_width_continuous(range = c(.5, 3)) +
  theme_graph(base_size = base_size) +
  theme(legend.position = "none")

# 2. Linear layout with edges drawn as arcs.
ggraph(g, layout = "linear") +
  geom_edge_arc(aes(edge_width = weight)) +
  geom_node_point() +
  geom_node_text(aes(label = name), repel = TRUE, size = 3) +
  scale_edge_width_continuous(range = c(.5, 3)) +
  theme_graph(base_size = base_size) +
  theme(legend.position = "none") +
  ggtitle("Linear algorithm")

# 3. The same linear layout with `circular = TRUE`.
# NOTE(review): the title says "Star algorithm" but the layout requested is
# "linear" with circular = TRUE -- confirm which was intended.
ggraph(g, layout = "linear", circular = TRUE) +
  geom_edge_arc(aes(edge_width = weight)) +
  geom_node_point() +
  geom_node_text(aes(label = name), repel = TRUE, size = 3) +
  scale_edge_width_continuous(range = c(.5, 3)) +
  theme_graph(base_size = base_size) +
  theme(legend.position = "none") +
  ggtitle("Star algorithm (circular)")